Introduction

Description of the Dataset:

The dataset can be found at https://www.kaggle.com/avikasliwal/used-cars-price-prediction#train-data.csv

It has following parameters:

Name: The brand and model of the car.
Location: The location in which the car is being sold or is available for purchase.
Year: The year or edition of the model.
Kilometers_Driven: The total kilometres driven in the car by the previous owner(s) in KM.
Fuel_Type: The type of fuel used by the car. (Petrol / Diesel / Electric / CNG / LPG)
Transmission: The type of transmission used by the car. (Automatic / Manual)
Owner_Type: Whether the ownership is Firsthand, Second hand or other.
Mileage: The standard mileage offered by the car company in kmpl or km/kg
Engine: The displacement volume of the engine in cc.
Power: The maximum power of the engine in bhp.
Seats: The number of seats in the car.
New_Price: The price of a new car of the same model.
Price: The price of the used car in INR Lakhs.

The price prediction for used cars is a very important part of used car business. Predicting prices accurately can help businesses and customers to settle deal at a fair price. Vehicle price prediction especially when the vehicle is used and not coming direct from the factory, is both a critical and important task. With increase in demand for used cars and upto 8 percent decrease in demand for the new cars in 2013,more and more vehicle buyers are finding alternatives of buying new cars outright.

Methods

#reading data from the csv
library(readr)
train_data = read_csv("./train-data.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   Name = col_character(),
##   Location = col_character(),
##   Year = col_double(),
##   Kilometers_Driven = col_double(),
##   Fuel_Type = col_character(),
##   Transmission = col_character(),
##   Owner_Type = col_character(),
##   Mileage = col_character(),
##   Engine = col_character(),
##   Power = col_character(),
##   Seats = col_double(),
##   New_Price = col_character(),
##   Price = col_double()
## )
test_data = read_csv("./test-data.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   Name = col_character(),
##   Location = col_character(),
##   Year = col_double(),
##   Kilometers_Driven = col_double(),
##   Fuel_Type = col_character(),
##   Transmission = col_character(),
##   Owner_Type = col_character(),
##   Mileage = col_character(),
##   Engine = col_character(),
##   Power = col_character(),
##   Seats = col_double(),
##   New_Price = col_character()
## )

Cleaning the dataset

The first thing we did was to clean the dataset that we are given. In the dataset, the values for Power, Mileage and Engine are added as strings with their units along with them. So we had to remove those units and convert their types to numeric.

Variables like Location,Transmission, Fuel_Type and Owner_Type can be considered as factor variables therefore they were changed to factor variables.

Columns for X1 and Name were removed as they were not really affecting the price.

#clean data function
clean_data = function(data) {
  #removing New  Price column
  data = data[,-13]
  remove_units = function(values) {
    remove_units_value =  function(value) {
      as.numeric(sub("\\s+\\D+$", "", value))
    }
    unlist(lapply(values, remove_units_value))
  }
  
  data$Location = factor(data$Location)
  data$Transmission = factor(data$Transmission)
  data$Fuel_Type = factor(data$Fuel_Type)
  data$Owner_Type = factor(data$Owner_Type)
  data$Power = remove_units(data$Power)
  data$Mileage = remove_units(data$Mileage)
  data$Engine = remove_units(data$Engine)
  data = na.omit(data)
  data = data[-which(data$Mileage == 0),]
  
  #removing Name and X1 columnns from the dataset
  data = data[,-c(1,2)]
  
  return(data)
}

train_data = clean_data(train_data)
test_data = clean_data(test_data)

Splitting the dataset

We split our train_data into trn_data and tst_data. The test_data provided in the file with the dataset is for competition and does not include any Price paramter

smp_size = floor(0.75 * nrow(train_data))

## set the seed to make your partition reproducible
set.seed(42)
train_ind = sample(seq_len(nrow(train_data)), size = smp_size)

trn_data = train_data[train_ind, ]
tst_data = train_data[-train_ind, ]

head(trn_data)
## # A tibble: 6 x 11
##   Location  Year Kilometers_Driv… Fuel_Type Transmission Owner_Type Mileage
##   <fct>    <dbl>            <dbl> <fct>     <fct>        <fct>        <dbl>
## 1 Pune      2017            48000 Diesel    Manual       First         28.4
## 2 Kochi     2016            39854 Petrol    Manual       First         18  
## 3 Pune      2007            59000 Petrol    Manual       Third         15.3
## 4 Jaipur    2015            66000 Diesel    Manual       First         20.1
## 5 Bangalo…  2014            62000 Petrol    Manual       Second        17.4
## 6 Mumbai    2012            23152 Petrol    Manual       First         15.6
## # … with 4 more variables: Engine <dbl>, Power <dbl>, Seats <dbl>,
## #   Price <dbl>

Correlation between parameters

cor(trn_data[c("Kilometers_Driven","Mileage","Engine","Power")])
##                   Kilometers_Driven     Mileage      Engine       Power
## Kilometers_Driven        1.00000000 -0.05894872  0.09081845  0.04318149
## Mileage                 -0.05894872  1.00000000 -0.65082506 -0.55709084
## Engine                   0.09081845 -0.65082506  1.00000000  0.86371639
## Power                    0.04318149 -0.55709084  0.86371639  1.00000000

We can see from the correlation matrix that Power and Engine are very much correlated. Also Engine and Mileage also upto some extent. We will have to be careful before using all of them for our model.

Plotting relationships between the Price and other paramters

plot(Price ~ Power, data = trn_data, 
     col = "dodgerblue",
     main = "Price and Power relation")

plot(log(Price) ~ Power, data = trn_data, 
     col = "dodgerblue",
     main = "log(Price) and Power relation")

plot(log(Price) ~ log(Power), data = trn_data, 
     col = "dodgerblue",
     main = "log(Price) and log(Power) relation")

plot(Price ~ Mileage, data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Mileage) relation")

plot(log(Price) ~ log(Mileage), data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Mileage) relation")

plot(Price ~ Engine, data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Engine) relation")

plot(log(Price) ~ log(Engine), data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Engine) relation")

plot(Price ~ Kilometers_Driven, data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Engine) relation")

plot(log(Price) ~ log(Kilometers_Driven), data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Engine) relation")

From these plots we were inferred that the logs of the above parameters are better suited for regression.

Functions to check Assumptions

plot_fit_res = function(model, title = "") {
  plot(fitted(model), resid(model), col = "grey", pch = 20,
  xlab = "Fitted", ylab = "Residuals", main = title)
  abline(h = 0, col = "darkorange", lwd = 2)
}
plot_qq = function(model, title = "") {
  qqnorm(resid(model), main = title, col = "darkgrey")
  qqline(resid(model), col = "dodgerblue", lwd = 2)
}
plot_fit_pred = function(model, data, title = "") {
  predicted = predict(model, newdata = data)
  plot(predicted, log(data$Price), col = "grey", pch = 20,
  xlab = "Predicted", ylab = "Actual", main = title)
  abline(a = 0, b = 1, col = "orange", lwd = 3)
}

Functions to compare different models

calc_aic = function(model) {
  rss = sum(resid(model)^2)
  n = length(resid(model))
  p = length(model$coefficients)
  return(n*log(rss/n)+2*p)
}
calc_bic = function(model) {
  rss = sum(resid(model)^2)
  n = length(resid(model))
  p = length(model$coefficients)
  return(n*log(rss/n)+(log(n)*p))
}
calc_loocv_rmse = function(model) {
  return(sqrt(mean((resid(model) / (1 - hatvalues(model))) ^ 2)))
}
calc_adj_r2 = function(model) {
  mod_sum = summary(model)
  return(mod_sum[["adj.r.squared"]])
}
calc_rmse = function(model, data) {
  y_hat = exp(predict(model, newdata = data))
  y = data$Price
  return(sqrt(mean((y_hat - y)^2)))
}
calc_quality_criterions = function(model) {
  return(data.frame("AIC" = calc_aic(model),
                    "BIC" = calc_bic(model),
                    "LOOCV_RMSE" = calc_loocv_rmse(model),
                    "ADJ_R_SQ" = calc_adj_r2(model)
                    ))
}

Creating and testing different models

The first model we created was an additive model usig all the parameters.

model1 = lm(Price ~ ., data = trn_data)
calc_quality_criterions(model1)
##        AIC      BIC LOOCV_RMSE ADJ_R_SQ
## 1 15346.78 15500.03   6.484846  0.71617
plot_fit_res(model1, title = "Residuals vs fitted plot for Model1")

plot_qq(model1, title = "QQ plot for Model1")

As we have already seen from the plots that it is better to use the log for numeric parameters we also created the model using the logs for Power, Mileage, Engine and Kilometers driven.

model2 = lm(log(Price) ~ Location+Year+log(Kilometers_Driven)+Fuel_Type+Transmission+Owner_Type+log(Mileage)+log(Engine)+log(Power)+Seats, data = trn_data)
calc_quality_criterions(model2)
##         AIC       BIC LOOCV_RMSE ADJ_R_SQ
## 1 -11072.54 -10919.29   0.283115 0.892384
plot_fit_res(model2, title = "Residuals vs fitted plot for Model2")

plot_qq(model2, title = "QQ plot for Model2")

We were able to improve all the criterias by just using log. AIC, BIC and LOOCV_RMSE decreased and ADJ_R_SQ increased.

We tried to further decrease the AIC and BIC by using the Backward search on the model2.

model3 = step(model2, direction = "backward")
## Start:  AIC=-11072.54
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Engine) + 
##     log(Power) + Seats
## 
##                          Df Sum of Sq    RSS      AIC
## - Seats                   1     0.016 346.66 -11074.3
## <none>                                346.64 -11072.5
## - log(Mileage)            1     2.029 348.67 -11049.0
## - log(Engine)             1     2.908 349.55 -11037.9
## - Owner_Type              3     4.231 350.87 -11025.4
## - log(Kilometers_Driven)  1    11.149 357.79 -10935.8
## - Fuel_Type               3    28.222 374.86 -10735.5
## - Transmission            1    43.677 390.32 -10554.4
## - Location               10    50.604 397.24 -10495.3
## - log(Power)              1   106.952 453.59  -9895.9
## - Year                    1   241.819 588.46  -8755.0
## 
## Step:  AIC=-11074.33
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Engine) + 
##     log(Power)
## 
##                          Df Sum of Sq    RSS      AIC
## <none>                                346.66 -11074.3
## - log(Mileage)            1     2.110 348.77 -11049.7
## - log(Engine)             1     3.033 349.69 -11038.1
## - Owner_Type              3     4.231 350.89 -11027.2
## - log(Kilometers_Driven)  1    11.284 357.94 -10935.9
## - Fuel_Type               3    28.858 375.52 -10729.9
## - Transmission            1    45.754 392.41 -10533.0
## - Location               10    50.661 397.32 -10496.5
## - log(Power)              1   121.792 468.45  -9756.6
## - Year                    1   254.504 601.16  -8663.4
n = length(resid(model2))
model4 = step(model2, direction = "backward", k = log(n))
## Start:  AIC=-10919.29
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Engine) + 
##     log(Power) + Seats
## 
##                          Df Sum of Sq    RSS      AIC
## - Seats                   1     0.016 346.66 -10927.5
## <none>                                346.64 -10919.3
## - log(Mileage)            1     2.029 348.67 -10902.1
## - Owner_Type              3     4.231 350.87 -10891.3
## - log(Engine)             1     2.908 349.55 -10891.1
## - log(Kilometers_Driven)  1    11.149 357.79 -10788.9
## - Fuel_Type               3    28.222 374.86 -10601.4
## - Transmission            1    43.677 390.32 -10407.5
## - Location               10    50.604 397.24 -10405.9
## - log(Power)              1   106.952 453.59  -9749.0
## - Year                    1   241.819 588.46  -8608.1
## 
## Step:  AIC=-10927.47
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Engine) + 
##     log(Power)
## 
##                          Df Sum of Sq    RSS      AIC
## <none>                                346.66 -10927.5
## - log(Mileage)            1     2.110 348.77 -10909.3
## - Owner_Type              3     4.231 350.89 -10899.5
## - log(Engine)             1     3.033 349.69 -10897.7
## - log(Kilometers_Driven)  1    11.284 357.94 -10795.5
## - Fuel_Type               3    28.858 375.52 -10602.1
## - Location               10    50.661 397.32 -10413.5
## - Transmission            1    45.754 392.41 -10392.5
## - log(Power)              1   121.792 468.45  -9616.2
## - Year                    1   254.504 601.16  -8522.9
calc_quality_criterions(model4)
##         AIC       BIC LOOCV_RMSE  ADJ_R_SQ
## 1 -11074.33 -10927.47  0.2830213 0.8924036

Using model4 we were able to reduce the LOOCV_RMSE and the ADJ_R_SQ further with minimal effect on AIC.

From the correlation testng earlier we had found that Engine and Power are highly corrlated. This means we should also try by removing the Engine Parameter and see if it improves anything.

model5 = lm(log(Price) ~ Location+Year+log(Kilometers_Driven)+Fuel_Type+Transmission+Owner_Type+log(Mileage)+log(Power)+Seats, data = trn_data)
model5 = step(model5, direction = "backward", k = log(n))
## Start:  AIC=-10891.05
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Power) + Seats
## 
##                          Df Sum of Sq    RSS      AIC
## - Seats                   1      0.14 349.69 -10897.7
## <none>                                349.55 -10891.1
## - Owner_Type              3      4.32 353.87 -10862.4
## - log(Mileage)            1      6.73 356.28 -10815.8
## - log(Kilometers_Driven)  1     11.04 360.59 -10763.1
## - Transmission            1     41.95 391.50 -10402.7
## - Location               10     50.74 400.29 -10380.8
## - Fuel_Type               3     54.72 404.27 -10278.8
## - Year                    1    239.64 589.18  -8611.1
## - log(Power)              1    329.71 679.26  -7987.6
## 
## Step:  AIC=-10897.67
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Power)
## 
##                          Df Sum of Sq    RSS      AIC
## <none>                                349.69 -10897.7
## - Owner_Type              3      4.33 354.02 -10868.9
## - log(Mileage)            1     10.79 360.48 -10772.9
## - log(Kilometers_Driven)  1     10.91 360.60 -10771.4
## - Transmission            1     43.17 392.86 -10395.9
## - Location               10     50.66 400.35 -10388.6
## - Fuel_Type               3     68.40 418.09 -10139.8
## - Year                    1    255.21 604.90  -8504.1
## - log(Power)              1    339.76 689.45  -7930.7
calc_quality_criterions(model5)
##         AIC       BIC LOOCV_RMSE ADJ_R_SQ
## 1 -11038.15 -10897.67  0.2841992 0.891487
plot_fit_res(model5, title = "Residuals vs fitted plot for Model5")

plot_qq(model5, title = "QQ plot for Model5")

plot_fit_pred(model5, trn_data, title = "Predicted vs Actual for Model5")

We then also tried some interaction models

model6 = lm(log(Price) ~ (Location+Year+log(Kilometers_Driven)+Fuel_Type+Transmission+Owner_Type+log(Mileage)+log(Engine)+log(Power)+Seats)^2, data = trn_data)
model7 = step(model6, direction = "backward", trace = FALSE)
calc_quality_criterions(model7)
##         AIC       BIC LOOCV_RMSE  ADJ_R_SQ
## 1 -12124.09 -11268.43        Inf 0.9177862
plot_fit_res(model7, title = "Residuals vs fitted plot for Model7")

plot_qq(model7, title = "QQ plot for Model7")

plot_fit_pred(model7, trn_data, title = "Predicted vs Actual for Model7")
## Warning in predict.lm(model, newdata = data): prediction from a rank-
## deficient fit may be misleading

Using model7 we were further able to decrease the AIC and BIC from the additive models.

Results

We were able to find 2 models that were giving us very good results. One is additive and another is an interaction model.

Additive Model

model_add = model5
calc_quality_criterions(model_add)
##         AIC       BIC LOOCV_RMSE ADJ_R_SQ
## 1 -11038.15 -10897.67  0.2841992 0.891487
plot_fit_res(model_add, title = "Residuals vs fitted plot for Additive model")

plot_qq(model_add, title = "QQ plot for Additive model")

plot_fit_pred(model_add, trn_data, title = "Predicted vs Actual for Additive Model For train data")

plot_fit_pred(model_add, tst_data, title = "Predicted vs Actual for Additive Model For test data")

calc_rmse(model_add, tst_data)
## [1] 5.799698

Interaction Model

model_int = model7
calc_quality_criterions(model_int)
##         AIC       BIC LOOCV_RMSE  ADJ_R_SQ
## 1 -12124.09 -11268.43        Inf 0.9177862
plot_fit_res(model_int, title = "Residuals vs fitted plot for Interaction model")

plot_qq(model_int, title = "QQ plot for Interaction model")

plot_fit_pred(model_int, trn_data, title = "Predicted vs Actual for Interaction model for train data")

plot_fit_pred(model_add, tst_data, title = "Predicted vs Actual for Interaction Model For test data")

calc_rmse(model_int, tst_data)
## [1] 5.583086

Discussion

In context of the data, both the models use the log(Price) as the response which as shown earlier gives a better result. Parameters like Name and ID were removed from both the models. The numerical values like Kilometers_driven, Engine, Power and Mileage were shown to waork bettter when used with log. There were many dummy parameters that were used to accomodate the factor variables like Owner_Type, Transmission, Location.

The AIC and BIC values for both the models are less which is good. The LOOCV_RMSE values are also less. The adjusted r squared values are close to 1. The residual vs fitted plot shows that both the models follow the equal assumption to some extent and the qq plots shows that the models also follow normality assumption.

The Interaction model has less RMSE than Additive model, whcih means that the interaction model gives better results than the Additive model.

Appendinx

library(readr)

#reading data from the csv
train_data = read_csv("./train-data.csv")
test_data = read_csv("./test-data.csv")

#clean data function
clean_data = function(data) {
  #removing New  Price column
  data = data[,-13]
  remove_units = function(values) {
    remove_units_value =  function(value) {
      as.numeric(sub("\\s+\\D+$", "", value))
    }
    unlist(lapply(values, remove_units_value))
  }
  data$Location = factor(data$Location)
  data$Transmission = factor(data$Transmission)
  data$Fuel_Type = factor(data$Fuel_Type)
  data$Owner_Type = factor(data$Owner_Type)
  data$Power = remove_units(data$Power)
  data$Mileage = remove_units(data$Mileage)
  data$Engine = remove_units(data$Engine)
  data = na.omit(data)
  data = data[-which(data$Mileage == 0),]
  data = data[,-c(1,2)]
  return(data)
}

loocv_rmse = function(model) {
  sqrt(mean((resid(model) / (1 - hatvalues(model))) ^ 2))
}

train_data = clean_data(train_data)
test_data = clean_data(test_data)

pairs(train_data[c("Power","Mileage","Engine","Price")])

plot(log(Price) ~ log(Power), data = train_data)
train_data_nm = train_data
mileage_transform = function(x) {
  return(-x)
}
plot(log(Price) ~ mileage_transform(Mileage), data = train_data_nm)

plot(log(Price) ~ log(Engine), data = train_data)

train_data_nm = train_data
train_data_nm = train_data[-which.max(train_data$Kilometers_Driven),]
plot(log(Price) ~ log(Kilometers_Driven), data = train_data_nm)

train_data = train_data[,-c(1,2)]
model = lm(Price ~ ., data =  train_data)
model_aic = step(model, direction = "backward")
anova(model_aic, model)

log_model = lm(log(Price) ~ .,data =  train_data)
log_model_aic = step(log_model, direction = "backward")

loocv_rmse(log_model)
loocv_rmse(log_model_aic)
sqrt(mean(resid(log_model_aic)^2))

sqrt(mean(resid(model_aic)^2))
loocv_rmse(model_aic)


log_model_int = lm(log(Price) ~ (.)^2, data = train_data)
log_model_int_aic = step(log_model_int, direction = "backward")
loocv_rmse(log_model_int_aic)
anova(log_model_aic, log_model_int_aic)
sqrt(mean(resid(log_model_int_aic)^2))

price_test_hat = exp(predict(log_model_int_aic, newdata = test_data))
train_data = clean_data(train_data)
test_data = clean_data(test_data)

smp_size = floor(0.75 * nrow(train_data))

## set the seed to make your partition reproducible
set.seed(42)
train_ind = sample(seq_len(nrow(train_data)), size = smp_size)

trn_data = train_data[train_ind, ]
tst_data = train_data[-train_ind, ]

plot_fit_res = function(model, title = "") {
  plot(fitted(model), resid(model), col = "grey", pch = 20,
  xlab = "Fitted", ylab = "Residuals", main = title)
  abline(h = 0, col = "darkorange", lwd = 2)
}
plot_qq = function(model, title = "") {
  qqnorm(resid(model), main = title, col = "darkgrey")
  qqline(resid(model), col = "dodgerblue", lwd = 2)
}
plot_fit_pred = function(model, data, title = "") {
  predicted = predict(model, newdata = data)
  plot(predicted, log(data$Price), col = "grey", pch = 20,
  xlab = "Predicted", ylab = "Actual", main = title)
  abline(a = 0, b = 1, col = "orange", lwd = 3)
}
calc_aic = function(model) {
  rss = sum(resid(model)^2)
  n = length(resid(model))
  p = length(model$coefficients)
  return(n*log(rss/n)+2*p)
}
calc_bic = function(model) {
  rss = sum(resid(model)^2)
  n = length(resid(model))
  p = length(model$coefficients)
  return(n*log(rss/n)+(log(n)*p))
}
calc_loocv_rmse = function(model) {
  return(sqrt(mean((resid(model) / (1 - hatvalues(model))) ^ 2)))
}
calc_adj_r2 = function(model) {
  mod_sum = summary(model)
  return(mod_sum[["adj.r.squared"]])
}
calc_rmse = function(model, data) {
  y_hat = exp(predict(model, newdata = data))
  y = data$Price
  return(sqrt(mean((y_hat - y)^2)))
}
calc_quality_criterions = function(model) {
  return(data.frame("AIC" = calc_aic(model),
                    "BIC" = calc_bic(model),
                    "LOOCV_RMSE" = calc_loocv_rmse(model),
                    "ADJ_R_SQ" = calc_adj_r2(model)
                    ))
}